
import pickle
from transformers import Qwen2Tokenizer, Qwen2TokenizerFast

from constant import qwen05_pretrained_model


# Load the raw corpus and keep only lines 10000..11999 as the working sample.
with open("./source.txt", "r", encoding="utf-8") as f:
    texts = f.readlines()

# Keep a 2000-line slice and merge it into a single string for tokenization.
texts = texts[10000:12000]
text = "".join(texts)

# Prefer the fast (Rust-backed) tokenizer; fall back to the slow pure-Python
# implementation if the fast one cannot be loaded.
try:
    tokenizer = Qwen2TokenizerFast.from_pretrained(qwen05_pretrained_model)
    print("using tokenizer fast")
except Exception as e:
    # Report why the fast tokenizer failed instead of swallowing it silently,
    # so environment problems (missing tokenizers lib, bad model path) surface.
    print(f"fast tokenizer unavailable ({e!r}); using slow tokenizer")
    tokenizer = Qwen2Tokenizer.from_pretrained(qwen05_pretrained_model)

# Tokenize the whole sample once; training windows are sliced from this list.
token_ids = tokenizer.encode(text)
total_length = len(token_ids)

# Window size: `length` input tokens plus one extra token for the shifted
# next-token target (windows below are length + 1 ids long).
length = 128

result = []

# Slide a (length + 1)-token window one position at a time: `length` input
# ids plus one shifted target id per sample. A full window starting at
# `start` needs start + length + 1 <= total_length, so there are at most
# total_length - length complete windows. Cap the count at 10000 samples
# and never go past the end, so short or empty trailing slices (which the
# old hard-coded range(0, 10000) could emit on small corpora) are avoided.
num_windows = min(10000, max(0, total_length - length))
for start in range(num_windows):
    result.append(token_ids[start: start + length + 1])

# Persist the windowed token-id samples for the downstream training pipeline.
with open("./test.pkl", "wb") as out_file:
    pickle.dump(result, out_file)
